In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline

np.random.seed(42)
In [2]:
df = pd.read_csv('classroom_actions.csv')
df.head()
Out[2]:
timestamp id group total_days completed
0 2015-08-10 17:06:01.032740 610019 experiment 97 True
1 2015-08-10 17:15:28.950975 690224 control 75 False
2 2015-08-10 17:34:40.920384 564994 experiment 128 True
3 2015-08-10 17:50:39.847374 849588 experiment 66 False
4 2015-08-10 19:10:40.650599 849826 experiment 34 False
In [10]:
# The total_days represents the total amount of time
# each student has spent in classroom.
# get the average classroom time for control group
df.drop_duplicates(subset=['id'], inplace=True)
control_mean = df.query('group=="control"').total_days.mean()

# get the average classroom time for experiment group
experiment_mean = df.query('group=="experiment"').total_days.mean()

# display average classroom time for each group
control_mean, experiment_mean
Out[10]:
(73.368990384615387, 74.671593533487297)
In [11]:
# compute observed difference in classroom time
obs_diff = experiment_mean - control_mean

# display observed difference
obs_diff
Out[11]:
1.3026031488719099
In [13]:
# create sampling distribution of difference in average classroom times
# with boostrapping
diffs = []
for _ in range(10000):
    sample = df.sample(len(df), replace=True) 
    sample_control_mean = sample.query('group=="control"').total_days.mean() 
    sample_experiment_mean = sample.query('group== "experiment"').total_days.mean() 
    diffs.append(sample_experiment_mean - sample_control_mean)
    
In [14]:
# convert to numpy array
diffs = np.array(diffs) 
In [15]:
# plot sampling distribution
plt.hist(diffs) 
Out[15]:
(array([    5.,    42.,   332.,  1241.,  2587.,  2966.,  1887.,   759.,
          159.,    22.]),
 array([ -1.71018109e+00,  -1.14129540e+00,  -5.72409700e-01,
         -3.52400341e-03,   5.65361693e-01,   1.13424739e+00,
          1.70313309e+00,   2.27201878e+00,   2.84090448e+00,
          3.40979018e+00,   3.97867587e+00]),
 <a list of 10 Patch objects>)
In [16]:
# simulate distribution under the null hypothesis
null_vals =np.random.normal(0, diffs.std(), len(diffs)) 
In [22]:
# plot null distribution
plt.hist(null_vals)

# plot line for observed statistic
plt.axvline(diffs.mean(), color='r')
Out[22]:
<matplotlib.lines.Line2D at 0x7fce88fa3cf8>
In [18]:
# compute p value
(null_vals > diffs.mean()).mean()
Out[18]:
0.039600000000000003
In [ ]: